lm_plotly_model <- function(DF){
plotly_plot <- DF%>%
ggplot(aes(x = (avg_sars_cov2_conc),
y = (case_rate),
color = n1_sars_cov2_lod,
fill = n2_sars_cov2_lod))+
geom_point()+
geom_smooth()+
ggtitle("Strong Linear Relationship Between Log Cases and Log COVID Concentration")+
xlab("log(Covid-19 concentration)")+
ylab("log(case rate)")
return(plotly::ggplotly(plotly_plot))#
}
num_samples <- 16000
for(A in c(1,3,5)){
X <- 2 * rnorm(num_samples) + 10
yn_exp <- rnorm(num_samples)
yn_lin <- exp(2*A)*rnorm(num_samples)
Y <- exp(X + yn_exp) + yn_lin
xn_exp <- rnorm(num_samples)
xn_lin <- exp(1*A)*rnorm(num_samples)
X <- exp(X + xn_exp)# + xn_lin
#plot(X, Y)
plot(log(X), log(Y))
}
library(Covid19Wastewater)
data(Pop_data, package = "Covid19Wastewater")
data(Covariants_data, package = "Covid19Wastewater")
#convert covariant info to most prevalent covariant to make more efficient
t_df <- Covariants_data%>%#remove info about total number of genes
select(-total_sequences)
t_df$main_variant<-colnames(t_df)[apply(t_df, 1, which.max)]#add column saying max value
covar_floor_df <- t_df%>%#add merge variable to merge weekly data to daily+ data
mutate(dumby_merge_var = floor(as.numeric(lubridate::ymd(week)) / 7),
main_variant = as.factor(main_variant))%>%
select(dumby_merge_var, main_variant)
data(Pop_data, package = "Covid19Wastewater")
data(Covariants_data, package = "Covid19Wastewater")
Graph_DF <- data.select.1%>%
left_join(rename(Pop_data, wwtp_name = site), by = c("wwtp_name"))%>%
filter(case_rate != 0,
!is.na(pop))%>%
mutate(avg_sars_cov2_conc = log(avg_sars_cov2_conc + 1),
case_rate = log(case_rate + 1),
LOD = (n1_sars_cov2_lod == "No")&(n2_sars_cov2_lod == "No"))%>%
mutate(dumby_merge_var = floor((floor(as.numeric(sample_collect_date) / 7) - 1)/2)*2 + 1 )%>%
left_join(covar_floor_df, by = c("dumby_merge_var"))%>%
select(-dumby_merge_var)
Graph_DF$pop_group <- as.factor(ntile(Graph_DF$pop, 4))
Graph_DF2 <- Graph_DF%>%
group_by(wwtp_name)%>%
mutate(avg_sars_cov2_conc = (avg_sars_cov2_conc - mean(avg_sars_cov2_conc))/ sd(avg_sars_cov2_conc),
#case_rate = exp(case_rate),
case_rate = (case_rate - mean(case_rate))/sd(case_rate))%>%
ungroup()
df_LOD <- Graph_DF%>%
filter(n1_sars_cov2_lod == "No",
n2_sars_cov2_lod == "No")
df_LOD2 <- Graph_DF2%>%
filter(n1_sars_cov2_lod == "No",
n2_sars_cov2_lod == "No")
lm_plotly_model(Graph_DF)
lm_plotly_model(Graph_DF2)
#M(t) = unknown function of T. Number of sick people at time t
#C(t) = e_l + a * M(t)*e_m + e_l
#W(t) = e_li + b * M(t)*e_e
plotly::ggplotly(
Graph_DF%>%
ggplot(aes(x = (avg_sars_cov2_conc),
y = (case_rate),
fill = pop_group,
color = LOD))+
geom_point()+
#geom_smooth(method = "lm")+
ggtitle("Strong Linear Relationship Between Log Cases and Log COVID Concentration")+
xlab("log(Covid-19 concentration)")+
ylab("log(case rate)")
)
plotly::ggplotly(
Graph_DF%>%
ggplot(aes(x = (avg_sars_cov2_conc),
y = (case_rate),
fill = main_variant,
color = LOD))+
geom_point()+
#geom_smooth(method = "lm")+
ggtitle("Strong Linear Relationship Between Log Cases and Log COVID Concentration")+
xlab("log(Covid-19 concentration)")+
ylab("log(case rate)")
)
gen_R2_params <- function(lm_formula, df){
lm_run <- lm(lm_formula, data = df)
return(c(summary(lm_run)$adj.r.squared, length(coef(lm_run))))
}
#main_variant
full_formula <- case_rate ~ avg_sars_cov2_conc:LOD:pop_group:main_variant + LOD:pop_group:main_variant
#
lm_full_inder_formula <- case_rate ~ avg_sars_cov2_conc:LOD:pop_group + avg_sars_cov2_conc:LOD:main_variant + LOD:pop_group + LOD:main_variant
#
lm_true_formula <- case_rate ~ avg_sars_cov2_conc:pop_group:main_variant + pop_group:main_variant
#
lm_true_inder_formula <- case_rate ~ avg_sars_cov2_conc:pop_group + avg_sars_cov2_conc:main_variant + pop_group + main_variant
##
lm_norm_formula <- case_rate ~ avg_sars_cov2_conc:LOD:main_variant + LOD:main_variant
##
lm_min_formula <- case_rate ~ avg_sars_cov2_conc:main_variant + main_variant
{
output_DF <- data.frame("Type" = c("ajed R^2", "num_param"),
"full model" = gen_R2_params(full_formula, Graph_DF),
"full indirect model" = gen_R2_params(lm_full_inder_formula, Graph_DF),
"true model" = gen_R2_params(lm_true_formula, df_LOD),
"true indirect model"= gen_R2_params(lm_true_inder_formula, df_LOD),
"norm site model" = gen_R2_params(lm_norm_formula, Graph_DF2),
"true norm site model" = gen_R2_params(lm_min_formula, df_LOD2))
n <- output_DF$Type
output_DF <- as.data.frame(t(output_DF[,-1]))
colnames(output_DF) <- n
output_DF
}
## ajed R^2 num_param
## full.model 0.6465076 81
## full.indirect.model 0.6339182 33
## true.model 0.6158864 41
## true.indirect.model 0.6059433 16
## norm.site.model 0.5726986 21
## true.norm.site.model 0.5315432 10
/////////////////////////////////////// // HFG Work ///////////////////////////////////////
data("HFGWaste_data", package = "Covid19Wastewater")
data(Case_data , package = "Covid19Wastewater")
#crazy agressive method
hfg_outlier_detection <- function(small_vec){
sortedVec <- sort(log(small_vec))
lower_quant <- sortedVec[4]
upper_quant <- sortedVec[6]
range <- upper_quant - lower_quant
retVec = ifelse(log(small_vec) > upper_quant + 1.5 * range,
NA, small_vec)
retVec = ifelse(log(small_vec) < lower_quant - 1.5 * range,
NA, small_vec)
retVec = ifelse(is.infinite(retVec), NA, retVec)
return(retVec)
}
Pop_DF <- data.frame(
site = c("Hudson","Kenosha","Platteville","Madison","Merrill","Plymouth","River Falls","Sun Prairie","Wausau","Oshkosh","Wausau"),
pop = c(19680,122000,14000,380000,10000,9000,16000,34926,42000,67000,42000)
)
hfg_waste_filt_df <- HFGWaste_data%>%
select(site, date, Filter, Well, N1, N2, PMMOV, HF183, CrP, everything())%>%
group_by(date, site)%>%
#mutate(across(N1:CrP, hfg_outlier_detection))%>%
left_join(Pop_DF)
trend_df <- hfg_waste_filt_df%>%
group_by(site)%>%
group_split()%>%
lapply(loessSmoothMod, InVar = "N1", OutVar = "Trend_N1")%>%
lapply(loessSmoothMod, InVar = "N2", OutVar = "Trend_N2")%>%
lapply(loessSmoothMod, InVar = "PMMOV", OutVar = "Trend_PMMOV")%>%
lapply(loessSmoothMod, InVar = "HF183", OutVar = "Trend_HF183")%>%
lapply(loessSmoothMod, InVar = "CrP", OutVar = "Trend_CrP")%>%
bind_rows()%>%
mutate(
Diff_N1 = Trend_N1 - N1,
Diff_N2 = Trend_N2 - N2,
Diff_PMMOV = Trend_PMMOV - PMMOV,
Diff_HF183 = Trend_HF183 - HF183,
Diff_CrP = Trend_CrP - CrP
)%>%
select(date, site, Filter, Well, pop, Trend_N1:Diff_CrP, N1LOD, N2LOD)
log_trend_df <- hfg_waste_filt_df%>%
mutate(log_N1 = log(N1),
log_N2 = log(N2),
log_PMMOV = log(PMMOV),
log_HF183 = log(HF183),
log_CrP = log(CrP))%>%
group_by(site)%>%
group_split()%>%
lapply(loessSmoothMod, InVar = "log_N1", OutVar = "Trend_N1")%>%
lapply(loessSmoothMod, InVar = "log_N2", OutVar = "Trend_N2")%>%
lapply(loessSmoothMod, InVar = "log_PMMOV", OutVar = "Trend_PMMOV")%>%
lapply(loessSmoothMod, InVar = "log_HF183", OutVar = "Trend_HF183")%>%
lapply(loessSmoothMod, InVar = "log_CrP", OutVar = "Trend_CrP")%>%
bind_rows()%>%
mutate(
Diff_N1 = Trend_N1 - log_N1,
Diff_N2 = Trend_N2 - log_N2,
Diff_PMMOV = Trend_PMMOV - log_PMMOV,
Diff_HF183 = Trend_HF183 - log_HF183,
Diff_CrP = Trend_CrP - log_CrP)%>%
select(site, date, Filter, Well, pop, Trend_N1:Diff_CrP, N1LOD, N2LOD)
diff_norm <- function(df){
df%>%
mutate(across(Diff_N1:Diff_CrP, ~.x - mean(.x, na.rm = TRUE)),
across(Diff_N1:Diff_CrP, ~ifelse(is.finite(.x),.x,NA)))
}
diff_var <- function(df, name){
df%>%
ungroup()%>%
summarise(across(Diff_N1:Diff_CrP, ~var(.x, na.rm = TRUE)))%>%
mutate(var_type = name)
}
gen_vars <- function(df){
trend_variance_df <- df%>%
group_by(site)%>%
diff_norm()
filter_variance_df <- df%>%
group_by(site, date)%>%
diff_norm()
well_variance_df <- df%>%
group_by(site, date, Filter)%>%
diff_norm()
bind_DF <- rbind(diff_var(trend_variance_df, "trend var"),
diff_var(filter_variance_df, "filter var"),
diff_var(well_variance_df, "well var"))%>%
pivot_longer(Diff_N1:Diff_CrP)%>%
mutate(name = factor(name, levels=c('Diff_N1', 'Diff_N2', 'Diff_PMMOV',
'Diff_HF183', 'Diff_CrP')),
var_type = factor(var_type, levels = c("trend var", "filter var",
"well var")))
levels(bind_DF$name) <- c("N1", "N2", "PMMoV", "HF183", "CrP")
return(bind_DF)
}
gen_plot_heat <- function(df, title = NA){
df%>%
ggplot(aes(x = name, y = var_type)) +
geom_tile(aes(fill = value)) +
geom_text(aes(label = round(value, 3))) +#formatC
scale_fill_gradient(low = "white", high = "red")+
scale_x_discrete(position = "top")+
xlab("signal source")+
ylab("variance source")+
ggtitle(title)
}
gen_plot_hist <- function(df, title = NA){
t_plot <- df%>%
ggplot(aes(x = name, fill = var_type))+
geom_col(aes(y = value), position="identity" ) +
xlab("signal source")+
ylab("Cumaltive Variance")+
ggtitle(title)
return(t_plot)
}
dis_df <- log_trend_df
var_output_base <- gen_vars(log_trend_df)
var_output_lod <- log_trend_df%>%
filter(N2LOD | N1LOD)%>%
gen_vars()
var_output_NLOD <- log_trend_df%>%
filter(!(N2LOD | N1LOD))%>%
gen_vars()
gen_plot_heat(var_output_base, "source of variance in HFG data")
gen_plot_heat(var_output_lod, "source of variance in above LOD info HFG data")
gen_plot_heat(var_output_NLOD, "source of variance in bellow LOD info HFG data")
gen_plot_hist(var_output_base, "source of variance in HFG data")
gen_plot_hist(var_output_lod, "source of variance in above LOD info HFG data")
gen_plot_hist(var_output_NLOD, "source of variance in bellow LOD info HFG data")
#Trend var: variance points of a day #variance of the signal
#Filter var: variance between collected points #collection variance
#well var: variance of 3 points #PCR test variance / extraction variance
#Gaussian mixture model
#change label to make more clear
#stacked in bar graph
#z = x + y
#var(z) = var(x) + var(y) if independent(
temp_func <- function(DF){
g_ret <- DF%>%
ggplot(aes(x = Diff_N1))+
geom_histogram()
return(g_ret)
}
temp_func(trend_variance_df)
temp_func(filter_variance_df)
temp_func(well_variance_df)
hist_plot <- ggplot(mapping = aes(x = Diff_N1))+
geom_histogram(data = well_variance_df, fill = "green")+
geom_histogram(data = filter_variance_df, fill = "blue")+
geom_histogram(data = trend_variance_df, fill = "red")
plotly::ggplotly(hist_plot)